credit_score <- dbSendQuery(con, "SELECT * FROM credit_score")
credit_score <- dbFetch(credit_score)
for (col in 1:ncol(credit_score)) {
  colnames(credit_score)[col] <- tolower(colnames(credit_score)[col])
}
credit_score <- subset(credit_score, select = -c(index, id))
categories <- c("sex", "education", "marriage", "default")
for (col in categories) {
  credit_score[, col] <- as.factor(credit_score[, col])
}
numerical <- names(subset(credit_score, select = -c(sex, education, marriage, default)))
for (n in numerical) {
  credit_score[, n] <- as.numeric(unlist(credit_score[, n]))
}
head(credit_score, 10)
##    limit_bal sex education marriage age pay_0 pay_2 pay_3 pay_4 pay_5 pay_6
## 1      20000   2         2        1  24     2     2     0     0     0     0
## 2     120000   2         2        2  26     0     2     0     0     0     2
## 3      90000   2         2        2  34     0     0     0     0     0     0
## 4      50000   2         2        1  37     0     0     0     0     0     0
## 5      50000   1         2        1  57     0     0     0     0     0     0
## 6      50000   1         1        2  37     0     0     0     0     0     0
## 7     500000   1         1        2  29     0     0     0     0     0     0
## 8     100000   2         2        2  23     0     0     0     0     0     0
## 9     140000   2         3        1  28     0     0     2     0     0     0
## 10     20000   1         3        2  35     0     0     0     0     0     0
##    bill_amt1 bill_amt2 bill_amt3 bill_amt4 bill_amt5 bill_amt6 pay_amt1
## 1       3913      3102       689         0         0         0        0
## 2       2682      1725      2682      3272      3455      3261        0
## 3      29239     14027     13559     14331     14948     15549     1518
## 4      46990     48233     49291     28314     28959     29547     2000
## 5       8617      5670     35835     20940     19146     19131     2000
## 6      64400     57069     57608     19394     19619     20024     2500
## 7     367965    412023    445007    542653    483003    473944    55000
## 8      11876       380       601       221      -159       567      380
## 9      11285     14096     12108     12211     11793      3719     3329
## 10         0         0         0         0     13007     13912        0
##    pay_amt2 pay_amt3 pay_amt4 pay_amt5 pay_amt6 default
## 1       689        0        0        0        0       1
## 2      1000     1000     1000        0     2000       1
## 3      1500     1000     1000     1000     5000       0
## 4      2019     1200     1100     1069     1000       0
## 5     36681    10000     9000      689      679       0
## 6      1815      657     1000     1000      800       0
## 7     40000    38000    20239    13750    13770       0
## 8       601        0      581     1687     1542       0
## 9         0      432     1000     1000     1000       0
## 10        0        0    13007     1122        0       0

First, I have to admit that this is a synthetical dataset, which means that there are no missing values, outliers, errors or any other mistakes, so these examinations will be skipped.

credit_score$limit_bal <- log(credit_score$limit_bal)
p1 <- ggplot(credit_score, aes(x = limit_bal)) + geom_histogram()
ggplotly(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
credit_score$age <- log(credit_score$age)
p2 <- ggplot(credit_score, aes(x = age)) + geom_histogram()
ggplotly(p2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p3 <- ggplot(credit_score, aes(x=sex, y=limit_bal)) + geom_boxplot()
ggplotly(p3)
p4 <- ggplot(credit_score, aes(x=education, y=limit_bal)) + geom_boxplot()
ggplotly(p4)
p5 <- ggplot(credit_score, aes(x=marriage, y=limit_bal)) + geom_boxplot()
ggplotly(p5)
t <- (table(credit_score$default,credit_score$sex))
t <- as.data.frame(t)
colnames(t) <- c('default', 'sex', 'cnt')
p6 <-
  ggplot(t, aes(x = default, y = cnt, fill = sex)) + geom_bar(stat = 'identity',  position = position_dodge())
ggplotly(p6)
t <- (table(credit_score$default, credit_score$education))
t <- as.data.frame(t)
colnames(t) <- c('default', 'education', 'cnt')

p6 <-
  ggplot(t, aes(x = default, y = cnt, fill=education)) + geom_bar(stat = 'identity',  position = position_dodge())
ggplotly(p6)
t <- (table(credit_score$default, credit_score$marriage))
t <- as.data.frame(t)
colnames(t) <- c('default', 'marriage', 'cnt')

p7 <-
  ggplot(t, aes(x = default, y = cnt, fill=marriage)) + geom_bar(stat = 'identity',  position = position_dodge())
ggplotly(p7)

Let’s test two hypothesis: - Are the mean credit limits (limit_bal) value for two groups default = 0 (didn’t returned the credit) and default = 1 equal to each other? - Are the distributions of the limit_bal for these two groups also equal to each other?

In order to answer these and the following questions I will calculate confidence intervals.

t.test(limit_bal ~ default, data = credit_score)
## 
##  Welch Two Sample t-test
## 
## data:  limit_bal by default
## t = 29.46, df = 10186, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  0.3673776 0.4197521
## sample estimates:
## mean in group 0 mean in group 1 
##        11.75006        11.35649
wilcox.test(limit_bal ~ default, data = credit_score)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  limit_bal by default
## W = 95786286, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

These results are obviously practically significant.

Now, lets test another pair of hypothesis: - Are the mean ages and their distributions for these two groups equal to each other?

t.test(age ~ default, credit_score)
## 
##  Welch Two Sample t-test
## 
## data:  age by default
## t = -1.2343, df = 10171, p-value = 0.2171
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -0.011595106  0.002634737
## sample estimates:
## mean in group 0 mean in group 1 
##         3.53598         3.54046
wilcox.test(age ~ default, credit_score)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  age by default
## W = 76966880, p-value = 0.3725
## alternative hypothesis: true location shift is not equal to 0

The result we received tells us that statistically, mean ages are different, but from the confidence interval value we can see that this difference is hardly practically signigicant.

Now let’s see if the gender composition for the two groups differ.

good <- filter(credit_score, default == 0)
bad <- filter(credit_score, default == 1)
c(ngoodmen, total_good, nbadmen, total_bad) %<-% c(table(good$sex)[1], sum(table(good$sex)), table(bad$sex)[1], sum(table(bad$sex)))
diffscoreci(ngoodmen, total_good, nbadmen, total_bad, conf.level = 0.95)
## 
## 
## 
## data:  
## 
## 95 percent confidence interval:
##  -0.06057240 -0.03366348

That means that men do not return their credits slightly more often (3-6%) than women.

Now, let’s see if the education level impacts default rate. First, calculate table which will show us the sizes of default and no-default groups for each education level, secondly, let’s see how do these sizes differ from the expected ones, next calculate the value of the statistical criteria.

crosstab <- table(credit_score$education, credit_score$default)
crosstab
##    
##         0     1
##   0    14     0
##   1  8549  2036
##   2 10700  3330
##   3  3680  1237
##   4   116     7
##   5   262    18
##   6    43     8
crosstab - chisq.test(crosstab)$expected
## Warning in chisq.test(crosstab): Chi-squared approximation may be incorrect
##    
##             0         1
##   0    3.0968   -3.0968
##   1  305.4020 -305.4020
##   2 -226.5640  226.5640
##   3 -149.3596  149.3596
##   4   20.2076  -20.2076
##   5   43.9360  -43.9360
##   6    3.2812   -3.2812
chisq.test(crosstab)
## Warning in chisq.test(crosstab): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  crosstab
## X-squared = 163.22, df = 6, p-value < 2.2e-16
assocstats(crosstab)
##                     X^2 df P(> X^2)
## Likelihood Ratio 184.71  6        0
## Pearson          163.22  6        0
## 
## Phi-Coefficient   : NA 
## Contingency Coeff.: 0.074 
## Cramer's V        : 0.074

Finally, let’s see if the marriage category impacts the default category.

marriage_crosstab <- table(credit_score$marriage, credit_score$default)
marriage_crosstab
##    
##         0     1
##   0    49     5
##   1 10453  3206
##   2 12623  3341
##   3   239    84
assocstats(marriage_crosstab)
##                     X^2 df   P(> X^2)
## Likelihood Ratio 36.609  3 5.5663e-08
## Pearson          35.662  3 8.8259e-08
## 
## Phi-Coefficient   : NA 
## Contingency Coeff.: 0.034 
## Cramer's V        : 0.034

For both variables (education and marriage) we see that they statitically significant impact the default category. However, the contigency coefficients (which tells us how strong the features are correlated) are relatively small.